# Adapted from github user zygmuntz (https://github.com/zygmuntz/classifying-text)

import csv
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR

from KaggleWord2VecUtility import KaggleWord2VecUtility

LABEL_INDEX = 1
TEXT_INDEX = 0
TEST_FILENAME = 'test_age_related_avg_groundtruth.csv'
TRAIN_FILENAME = 'train_full_reannotated.csv'
OUTPUT_FILENAME = 'outputs.csv'
MODEL_NAME = 'model.p'

#
# Read the data into a list of strings and a parallel list of labels
#
def read_data(filename):
    """Read a CSV file into parallel lists of texts and labels.

    Each non-empty row contributes ``row[TEXT_INDEX]`` as the text and
    ``row[LABEL_INDEX]`` as the label.  Sentiment140-style numeric labels
    are translated so they match the test-set annotations:
    ``'0'`` becomes ``'negative'`` and ``'4'`` becomes ``'positive'``.
    Any other label is kept verbatim; when that happens while reading
    ``TRAIN_FILENAME`` a warning line is printed.

    Args:
        filename: Path of the CSV file to read (``excel`` dialect).

    Returns:
        A ``(text_data, labels)`` tuple of two lists of equal length.

    Raises:
        IndexError: If a non-empty row has too few columns for
            ``TEXT_INDEX`` or ``LABEL_INDEX``.
    """
    # Sentiment140 train data uses 0 and 4 as labels
    # (0 = negative, 4 = positive).
    sentiment140_labels = {'0': 'negative', '4': 'positive'}

    text_data = []
    labels = []
    with open(filename) as f:
        for row in csv.reader(f, dialect='excel'):
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # rather than failing with IndexError on row[TEXT_INDEX].
                continue

            text_data.append(row[TEXT_INDEX])

            raw_label = row[LABEL_INDEX]
            if raw_label in sentiment140_labels:
                labels.append(sentiment140_labels[raw_label])
            else:
                # Only the training file is expected to carry 0/4 labels,
                # so other files pass their labels through silently.
                if filename == TRAIN_FILENAME:
                    print("UNEXPECTED LABEL")
                labels.append(raw_label)
    return text_data, labels


# Load the raw texts and labels for both splits.
# NOTE(review): test_y is not referenced again in the visible part of this
# script.
train, train_y = read_data(TRAIN_FILENAME)
test, test_y = read_data(TEST_FILENAME)

print("Parsing train text...")

# Run every text through KaggleWord2VecUtility.review_to_wordlist and
# re-join the resulting tokens with single spaces, so the vectorizer
# receives one cleaned string per document.
clean_train_data = [
    " ".join(KaggleWord2VecUtility.review_to_wordlist(data))
    for data in train
]

print("Parsing test sentences...")

# Same cleaning for the test split, so both splits are tokenised alike.
clean_test_data = [
    " ".join(KaggleWord2VecUtility.review_to_wordlist(data))
    for data in test
]


print("Vectorizing train data...")

# TF-IDF over word 1- to 3-grams, keeping at most 40000 features, with
# sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(max_features=40000, ngram_range=(1, 3),
                             sublinear_tf=True)

# The vectorizer is fitted on the training text only ...
train_x = vectorizer.fit_transform(clean_train_data)

print("Vectorizing test data...")

# ... and the test text is projected with that already-fitted vectorizer.
test_x = vectorizer.transform(clean_test_data)

print("Training...")

model = LR()
model.fit(train_x, train_y)

# Persist the trained classifier.  The context manager guarantees the file
# is flushed and closed (the handle used to be left open).
# NOTE(review): only the classifier is saved here; the fitted vectorizer is
# not pickled in the visible code.
with open(MODEL_NAME, "wb") as model_file:
    pickle.dump(model, model_file)

# Predicted class per test document, plus the per-class probabilities.
p = model.predict(test_x)
confidence_vals = model.predict_proba(test_x)

print("Writing results...")

# NOTE(review): this frame is not used by the CSV export below; it is kept
# because code outside this view may read it.
output = pd.DataFrame(data={"data": test, "class": p})

# The csv module wants a binary-mode file on Python 2, and a text-mode file
# opened with newline='' on Python 3.
if sys.version_info[0] >= 3:
    out_file = open(OUTPUT_FILENAME, 'w', newline='')
else:
    out_file = open(OUTPUT_FILENAME, 'wb')

# Only the original text and the predicted label are written; the
# probabilities in confidence_vals are not part of the output.  The context
# manager makes sure the rows are flushed and the file is closed.
with out_file as f:
    wr = csv.writer(f)
    wr.writerow(["data", "label"])

    known_labels = ("positive", "negative")
    for text, predicted in zip(test, p):
        # Any prediction other than the two expected classes is flagged.
        label = str(predicted) if predicted in known_labels else "ERROR"
        wr.writerow([text, label])